from openai import OpenAI

# Set OpenAI's API key and API base to use vLLM's API server.
# NOTE(review): vLLM's OpenAI-compatible server does not validate the key by
# default, so a placeholder value is sufficient — confirm if auth is enabled.
openai_api_key = "EMPTY"
openai_api_base = "http://192.168.50.109:8000/v1"

# Shared client instance used by the demo functions below.
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)


def print_sync(model="Qwen2.5-1.5B-Instruct", prompt="你是谁."):
    """Send one non-streaming chat completion request and print the full reply.

    Args:
        model: Name of the model served by the vLLM endpoint.
        prompt: User message to send. Defaults to the original demo prompt.
    """
    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    print("Chat response:", chat_response.choices[0].message.content)


def print_stream(model="Qwen2.5-1.5B-Instruct", prompt="写一个300字的故事."):
    """Stream a chat completion and print tokens to stdout as they arrive.

    Args:
        model: Name of the model served by the vLLM endpoint.
        prompt: User message to send. Defaults to the original demo prompt.
    """
    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )
    # Handle the streamed response chunk by chunk.
    for chunk in chat_response:
        # The delta object always has a `content` attribute, so the previous
        # hasattr() check passed even when content was None (role-only first
        # chunk, end-of-stream chunk) and printed the literal string "None".
        # The final chunk may also carry an empty `choices` list.
        if chunk.choices and chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end='', flush=True)
    print()  # terminate the streamed output with a newline

if __name__ == "__main__":
    # print_sync()  # uncomment to run the non-streaming demo instead
    print_stream()
